{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import collections\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "import re\n",
    "\n",
    "from argparse import Namespace"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "args = Namespace(\n",
    "    raw_train_dataset_csv=\"data/yelp/raw_train.csv\",\n",
    "    raw_test_dataset_csv=\"data/yelp/raw_test.csv\",\n",
    "    proportion_subset_of_train=0.1,\n",
    "    train_proportion=0.7,\n",
    "    val_proportion=0.15,\n",
    "    test_proportion=0.15,\n",
    "    output_munged_csv=\"data/yelp/reviews_with_splits_lite.csv\",\n",
    "    seed=1337\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Read raw data\n",
    "train_reviews = pd.read_csv(args.raw_train_dataset_csv, header=None, names=['rating', 'review'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# making the subset equal across the review classes\n",
    "by_rating = collections.defaultdict(list)\n",
    "for _, row in train_reviews.iterrows():\n",
    "    by_rating[row.rating].append(row.to_dict())\n",
    "    \n",
    "review_subset = []\n",
    "\n",
    "for _, item_list in sorted(by_rating.items()):\n",
    "\n",
    "    n_total = len(item_list)\n",
    "    n_subset = int(args.proportion_subset_of_train * n_total)\n",
    "    review_subset.extend(item_list[:n_subset])\n",
    "\n",
    "review_subset = pd.DataFrame(review_subset)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>rating</th>\n",
       "      <th>review</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>1</td>\n",
       "      <td>Unfortunately, the frustration of being Dr. Go...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>1</td>\n",
       "      <td>I don't know what Dr. Goldberg was like before...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>1</td>\n",
       "      <td>I'm writing this review to give you a heads up...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>1</td>\n",
       "      <td>Wing sauce is like water. Pretty much a lot of...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>1</td>\n",
       "      <td>Owning a driving range inside the city limits ...</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "   rating                                             review\n",
       "0       1  Unfortunately, the frustration of being Dr. Go...\n",
       "1       1  I don't know what Dr. Goldberg was like before...\n",
       "2       1  I'm writing this review to give you a heads up...\n",
       "3       1  Wing sauce is like water. Pretty much a lot of...\n",
       "4       1  Owning a driving range inside the city limits ..."
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "review_subset.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2    280000\n",
       "1    280000\n",
       "Name: rating, dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "train_reviews.rating.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2    28000\n",
       "1    28000\n",
       "Name: rating, dtype: int64"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "review_subset.rating.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{1, 2}"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Unique classes\n",
    "set(review_subset.rating)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Splitting the subset by rating to create our new train, val, and test splits\n",
    "by_rating = collections.defaultdict(list)\n",
    "for _, row in review_subset.iterrows():\n",
    "    by_rating[row.rating].append(row.to_dict())\n",
    "    \n",
    "final_list = []\n",
    "np.random.seed(args.seed)\n",
    "\n",
    "for _, item_list in sorted(by_rating.items()):\n",
    "\n",
    "    np.random.shuffle(item_list)\n",
    "    \n",
    "    n_total = len(item_list)\n",
    "    n_train = int(args.train_proportion * n_total)\n",
    "    n_val = int(args.val_proportion * n_total)\n",
    "    n_test = int(args.test_proportion * n_total)\n",
    "    \n",
    "    # Give data point a split attribute\n",
    "    for item in item_list[:n_train]:\n",
    "        item['split'] = 'train'\n",
    "    \n",
    "    for item in item_list[n_train:n_train+n_val]:\n",
    "        item['split'] = 'val'\n",
    "        \n",
    "    for item in item_list[n_train+n_val:n_train+n_val+n_test]:\n",
    "        item['split'] = 'test'\n",
    "\n",
    "    # Add to final list\n",
    "    final_list.extend(item_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create split data\n",
    "final_list = []\n",
    "np.random.seed(args.seed)\n",
    "\n",
    "for _, item_list in sorted(by_rating.items()):\n",
    "\n",
    "    np.random.shuffle(item_list)\n",
    "    \n",
    "    n_total = len(item_list)\n",
    "    n_train = int(args.train_proportion * n_total)\n",
    "    n_val = int(args.val_proportion * n_total)\n",
    "    n_test = int(args.test_proportion * n_total)\n",
    "    \n",
    "    # Give data point a split attribute\n",
    "    for item in item_list[:n_train]:\n",
    "        item['split'] = 'train'\n",
    "    \n",
    "    for item in item_list[n_train:n_train+n_val]:\n",
    "        item['split'] = 'val'\n",
    "        \n",
    "    for item in item_list[n_train+n_val:n_train+n_val+n_test]:\n",
    "        item['split'] = 'test'\n",
    "\n",
    "    # Add to final list\n",
    "    final_list.extend(item_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Write split data to file\n",
    "final_reviews = pd.DataFrame(final_list)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "train    39200\n",
       "test      8400\n",
       "val       8400\n",
       "Name: split, dtype: int64"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "final_reviews.split.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Preprocess the reviews\n",
    "def preprocess_text(text):\n",
    "    text = text.lower()\n",
    "    text = re.sub(r\"([.,!?])\", r\" \\1 \", text)\n",
    "    text = re.sub(r\"[^a-zA-Z.,!?]+\", r\" \", text)\n",
    "    return text\n",
    "    \n",
    "final_reviews.review = final_reviews.review.apply(preprocess_text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "final_reviews['rating'] = final_reviews.rating.apply({1: 'negative', 2: 'positive'}.get)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>rating</th>\n",
       "      <th>review</th>\n",
       "      <th>split</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>negative</td>\n",
       "      <td>all i can say is that a i had no other option ...</td>\n",
       "      <td>train</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>negative</td>\n",
       "      <td>i went here once when my long time stylist mov...</td>\n",
       "      <td>train</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>negative</td>\n",
       "      <td>i don t know why i stopped here for lunch this...</td>\n",
       "      <td>train</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>negative</td>\n",
       "      <td>did i order the wrong thing ? or maybe it was ...</td>\n",
       "      <td>train</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>negative</td>\n",
       "      <td>i went here for restaurant week . the restaura...</td>\n",
       "      <td>train</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "     rating                                             review  split\n",
       "0  negative  all i can say is that a i had no other option ...  train\n",
       "1  negative  i went here once when my long time stylist mov...  train\n",
       "2  negative  i don t know why i stopped here for lunch this...  train\n",
       "3  negative  did i order the wrong thing ? or maybe it was ...  train\n",
       "4  negative  i went here for restaurant week . the restaura...  train"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "final_reviews.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [],
   "source": [
    "final_reviews.to_csv(args.output_munged_csv, index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "nlpbook",
   "language": "python",
   "name": "nlpbook"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.2"
  },
  "toc": {
   "colors": {
    "hover_highlight": "#DAA520",
    "running_highlight": "#FF0000",
    "selected_highlight": "#FFD700"
   },
   "moveMenuLeft": true,
   "nav_menu": {
    "height": "12px",
    "width": "252px"
   },
   "navigate_menu": true,
   "number_sections": true,
   "sideBar": true,
   "threshold": "5",
   "toc_cell": false,
   "toc_section_display": "block",
   "toc_window_display": false
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
